In [1]:
#! pip install --upgrade ppscore
#! pip install --upgrade plotly
#! pip install --upgrade nbformat
#! pip install --upgrade pdpbox
#! pip install --upgrade shap
In [1]:
import gc, pickle
from copy import deepcopy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import ppscore as pps
import shap, optuna
from pdpbox import pdp
from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.inspection import permutation_importance
rmse = lambda true, pred: mse(true, pred) ** 0.5
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.decomposition import PCA
In [2]:
SEED = 2

Optimization, Tuning, and Interpretation

Load Data

In [3]:
# Load the cleaned dataset; positional `axis` was removed from DataFrame.drop
# in pandas 2.0, so use the explicit columns= keyword.
df = pd.read_pickle('berlin_housing_cleaned.pkl').drop(columns=['url', 'security_deposit'])
In [4]:
TARGET = 'rent'
VALID_SZ = 0.1
CAT_COLS = df.columns[df.dtypes == object].tolist()
CONT_COLS = [c for c in df.columns if c not in CAT_COLS and c != TARGET]
df = deepcopy(df[CAT_COLS + CONT_COLS + [TARGET]])

df.shape
Out[4]:
(1023, 14)
In [5]:
df.isnull().sum().sum()
Out[5]:
0
In [6]:
df.head()
Out[6]:
region condition furnishing heating_type energy_sources efficiency_class rooms year_construction space heating_costs parking_space renovated_date energy_requirement rent
0 mitte first_time_use none unknown unknown b 4.0 2019 117.20 0.000000 120.0 2019 108.364662 2659.00
1 kreuzberg first_time_use high quality unknown district unknown 1.0 2020 29.33 0.000000 0.0 2020 27.118904 1200.00
2 köpenick well_kept normal quality unknown district unknown 2.0 1997 83.61 90.000000 0.0 1997 77.306906 979.00
3 wilmersdorf well_kept none unknown unknown unknown 4.0 1900 171.18 63.878816 0.0 1900 158.275281 1830.22
4 kreuzberg first_time_use high quality unknown district unknown 2.0 2020 88.27 32.939497 0.0 2020 81.615604 2272.00

Predictive Power Score

In [7]:
pps_matrix = pps.matrix(df)
In [8]:
# heatmap of pairwise predictive power scores
# NOTE(review): this assumes pps.matrix() returns a wide feature-x-feature
# matrix (old ppscore API). ppscore >= 1.1 returns a long-format frame with
# columns (x, y, ppscore) — if that's the installed version, pivot first:
# pps_matrix.pivot(index='y', columns='x', values='ppscore'). Confirm version.
fig = go.Figure(
    data = go.Heatmap(
        z = pps_matrix.values,
        x = pps_matrix.index,
        y = pps_matrix.columns
    )
)
fig.update_layout(title_text = 'Predictive Power Score Heatmap')
fig.show()

Preprocess for Sklearn

In [9]:
# integer-encode every categorical column in place so sklearn can consume it
# NOTE(review): each LabelEncoder is discarded after fitting, so the
# category -> integer mapping cannot be inverted or reused on new data
for col in CAT_COLS:
    df[col] = LabelEncoder().fit_transform(df[col])
In [10]:
def get_train_test_split(df, x_cols, target, test_size, random_state=SEED):
    """Split `df` into train/validation features and targets.

    Returns (train_x, valid_x, train_y, valid_y) in sklearn's
    train_test_split output order; `y` stays a one-column DataFrame.
    """
    features = deepcopy(df[x_cols])
    labels = deepcopy(df[[target]])

    return train_test_split(features, labels, test_size=test_size, random_state=random_state)
In [11]:
train_x, valid_x, train_y, valid_y = get_train_test_split(
    df=df,
    x_cols=CONT_COLS + CAT_COLS,
    target=TARGET,
    test_size = VALID_SZ
)
In [12]:
train_x.shape, valid_x.shape, train_y.shape, valid_y.shape
Out[12]:
((920, 13), (103, 13), (920, 1), (103, 1))
In [13]:
train_y = deepcopy(train_y.values.ravel())
valid_y = deepcopy(valid_y.values.ravel())

Get Baseline & Initial Fit

In [14]:
def get_error(pred, true):
    """Print RMSE and MAE for predictions against ground truth."""
    root_mean_sq = rmse(true, pred)
    mean_abs = mae(true, pred)
    print(f'RMSE: {root_mean_sq}')
    print(f'MAE:  {mean_abs}')
In [15]:
# naive baseline (prediction is average of training data)
get_error(pred = np.array([np.mean(train_y) for _ in range(len(valid_y))]),
          true = valid_y)
RMSE: 945.6036152362188
MAE:  704.9453885605741
In [16]:
# naive baseline (prediction is median of training data)
get_error(pred = np.array([np.median(train_y) for _ in range(len(valid_y))]),
          true = valid_y)
RMSE: 971.6228490199381
MAE:  706.0349029126212

Fit RF With Defaults

In [17]:
model = RandomForestRegressor()
In [18]:
model.fit(train_x, train_y)
Out[18]:
RandomForestRegressor()
In [19]:
preds = model.predict(valid_x)
In [20]:
get_error(valid_y, preds)
RMSE: 356.59636895373575
MAE:  236.53923788304454

Tuning Random Forests

The three most important hyperparameters that should be tuned are:

  • The number of trees to grow (called n_estimators in scikit-learn)
  • The maximum depth each tree can grow to (called max_depth in scikit-learn)
  • The number of features that should be considered at each split (called max_features in scikit-learn)

Let's tune each hyperparameter separately below, starting with n_estimators:

In [21]:
def fit_and_eval(train_x, train_y, valid_x, valid_y,
                 n_estimators=100, max_features='auto', max_depth=None, n_jobs=-1):
    """Fit one RandomForestRegressor with the given hyperparameters and
    return its RMSE on the validation set.

    Note: the string 'auto' was removed as a max_features option in
    scikit-learn 1.3; for regressors it meant "use all features", so it is
    translated to 1.0 here to keep the old default/callers working.
    """
    if max_features == 'auto':
        max_features = 1.0  # all features — equivalent to the old 'auto' for regressors

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        n_jobs=n_jobs
    )

    model.fit(train_x, train_y)
    preds = model.predict(valid_x)

    return rmse(valid_y, preds)

def iter_hyperparam_values(train_x, train_y, valid_x, valid_y, test_param, test_param_values,
                           optimal_values=None, fit_iter=10):
    """Sweep one hyperparameter over `test_param_values`, refitting
    `fit_iter` times per value to average out model randomness.

    `optimal_values` holds already-tuned hyperparameters that are kept
    fixed while `test_param` varies.

    Returns a dict with 'avg', 'std', and 'min' error lists, each aligned
    with `test_param_values`.
    """
    errors = {'avg': [], 'std': [], 'min': []}

    # start from the already-tuned values (if any); only test_param changes
    hyperparams = dict(optimal_values or {})

    for value in test_param_values:

        hyperparams[test_param] = value

        trial_errors = [
            fit_and_eval(train_x, train_y, valid_x, valid_y, **hyperparams)
            for _ in range(fit_iter)
        ]

        mean_err = np.mean(trial_errors)
        spread = np.std(trial_errors)
        best = min(trial_errors)

        errors['avg'].append(mean_err)
        errors['std'].append(spread)
        errors['min'].append(best)

        print(f'With {value} {test_param}, Average Model Error of: {mean_err:.2f} (+/- {spread:.2f})')

    return errors

def plot_errors(errors:dict, xaxis_title:str, norm_values:bool, x_values=None):
    """Plot the avg/std/min error curves from a hyperparameter sweep.

    Parameters
    ----------
    errors : dict with 'avg', 'std', 'min' lists (from iter_hyperparam_values)
    xaxis_title : label for the x axis
    norm_values : if True, min-max scale each metric to [0, 1] so their
        shapes are comparable on one axis
    x_values : the swept hyperparameter values; defaults to the notebook
        global `test_param_values` (the previous implicit behavior)

    Returns a plotly Figure.
    """
    if x_values is None:
        # backward-compatible fallback to the notebook-level global
        x_values = test_param_values

    # use a name other than `plot_errors` so the function does not shadow itself
    metric_values = deepcopy({
        error_metric: (
            MinMaxScaler().fit_transform(np.array(errors[error_metric]).reshape(-1, 1)).ravel()
            if norm_values else errors[error_metric]
        ) for error_metric in ['avg', 'std', 'min']
    })

    fig = go.Figure(data = [
        go.Scatter(x=list(x_values), y=metric_values['avg'], line_shape='spline', name='Average Error'),
        go.Scatter(x=list(x_values), y=metric_values['std'], line_shape='spline', name='Standard Deviation'),
        go.Scatter(x=list(x_values), y=metric_values['min'], line_shape='spline', name='Best Fit'),
    ])

    fig.update_layout(xaxis_title=xaxis_title, yaxis_title='Metric Value')

    return fig
In [22]:
test_param        = 'n_estimators'
test_param_values = range(10, 310, 10)
optimal_values    = None
fit_iter          = 10

errors = iter_hyperparam_values(train_x, train_y, valid_x, valid_y,
                                test_param, test_param_values, optimal_values, fit_iter)
With 10 n_estimators, Average Model Error of: 371.86 (+/- 10.06)
With 20 n_estimators, Average Model Error of: 352.64 (+/- 15.92)
With 30 n_estimators, Average Model Error of: 350.05 (+/- 10.52)
With 40 n_estimators, Average Model Error of: 352.35 (+/- 7.69)
With 50 n_estimators, Average Model Error of: 354.65 (+/- 8.26)
With 60 n_estimators, Average Model Error of: 353.87 (+/- 6.35)
With 70 n_estimators, Average Model Error of: 356.74 (+/- 6.75)
With 80 n_estimators, Average Model Error of: 356.30 (+/- 2.66)
With 90 n_estimators, Average Model Error of: 353.01 (+/- 5.38)
With 100 n_estimators, Average Model Error of: 355.97 (+/- 4.18)
With 110 n_estimators, Average Model Error of: 354.20 (+/- 5.74)
With 120 n_estimators, Average Model Error of: 353.90 (+/- 3.72)
With 130 n_estimators, Average Model Error of: 350.96 (+/- 5.20)
With 140 n_estimators, Average Model Error of: 355.58 (+/- 2.99)
With 150 n_estimators, Average Model Error of: 351.94 (+/- 4.82)
With 160 n_estimators, Average Model Error of: 353.14 (+/- 3.67)
With 170 n_estimators, Average Model Error of: 356.02 (+/- 5.02)
With 180 n_estimators, Average Model Error of: 355.93 (+/- 4.26)
With 190 n_estimators, Average Model Error of: 349.78 (+/- 3.75)
With 200 n_estimators, Average Model Error of: 352.37 (+/- 4.74)
With 210 n_estimators, Average Model Error of: 354.31 (+/- 2.75)
With 220 n_estimators, Average Model Error of: 350.98 (+/- 3.56)
With 230 n_estimators, Average Model Error of: 352.87 (+/- 2.89)
With 240 n_estimators, Average Model Error of: 351.88 (+/- 3.40)
With 250 n_estimators, Average Model Error of: 354.44 (+/- 5.05)
With 260 n_estimators, Average Model Error of: 354.81 (+/- 3.41)
With 270 n_estimators, Average Model Error of: 351.86 (+/- 2.49)
With 280 n_estimators, Average Model Error of: 355.56 (+/- 3.36)
With 290 n_estimators, Average Model Error of: 351.49 (+/- 2.97)
With 300 n_estimators, Average Model Error of: 352.48 (+/- 2.17)
In [23]:
plot_errors(errors, xaxis_title='Number of Estimators', norm_values=True)

Set Optimal Value

In [24]:
#OPTIMAL_N_ESTIMATORS = ## ENTER OPTIMAL VALUE HERE

OPTIMAL_N_ESTIMATORS = 220

Now lets do the same thing with max_features:

In [25]:
test_param        = 'max_features'
test_param_values = ['auto', 'sqrt', 'log2']
optimal_values    = {'n_estimators': OPTIMAL_N_ESTIMATORS}
fit_iter          = 10
In [26]:
errors = iter_hyperparam_values(train_x, train_y, valid_x, valid_y,
                                test_param, test_param_values, optimal_values, fit_iter)
With auto max_features, Average Model Error of: 353.66 (+/- 3.25)
With sqrt max_features, Average Model Error of: 354.77 (+/- 5.42)
With log2 max_features, Average Model Error of: 354.93 (+/- 2.02)
In [27]:
plot_errors(errors, xaxis_title='Max Features', norm_values=True)

Set Optimal Value

In [28]:
OPTIMAL_MAX_FEATURES = 'auto'

Student Exercise: Find optimal value for max_depth

In [29]:
test_param        = 'max_depth'
test_param_values = range(2, 32, 2)
optimal_values    = {'n_estimators': OPTIMAL_N_ESTIMATORS, 'max_features': OPTIMAL_MAX_FEATURES}
fit_iter          = 10
In [30]:
errors = iter_hyperparam_values(train_x, train_y, valid_x, valid_y,
                                test_param, test_param_values, optimal_values, fit_iter)
With 2 max_depth, Average Model Error of: 490.86 (+/- 2.08)
With 4 max_depth, Average Model Error of: 417.36 (+/- 2.61)
With 6 max_depth, Average Model Error of: 384.81 (+/- 4.43)
With 8 max_depth, Average Model Error of: 365.44 (+/- 3.88)
With 10 max_depth, Average Model Error of: 355.85 (+/- 3.44)
With 12 max_depth, Average Model Error of: 353.44 (+/- 4.41)
With 14 max_depth, Average Model Error of: 353.73 (+/- 3.35)
With 16 max_depth, Average Model Error of: 354.19 (+/- 2.42)
With 18 max_depth, Average Model Error of: 354.49 (+/- 3.24)
With 20 max_depth, Average Model Error of: 351.88 (+/- 3.02)
With 22 max_depth, Average Model Error of: 353.53 (+/- 2.56)
With 24 max_depth, Average Model Error of: 354.37 (+/- 4.02)
With 26 max_depth, Average Model Error of: 352.33 (+/- 2.69)
With 28 max_depth, Average Model Error of: 354.53 (+/- 3.50)
With 30 max_depth, Average Model Error of: 355.09 (+/- 2.99)
In [31]:
plot_errors(errors, xaxis_title='Max Depth', norm_values=True)

Set Optimal Value

In [32]:
OPTIMAL_MAX_DEPTH = 28

And the best model is...

In [33]:
best_rf = RandomForestRegressor(n_estimators = OPTIMAL_N_ESTIMATORS,
                                 max_features = OPTIMAL_MAX_FEATURES,
                                 max_depth = OPTIMAL_MAX_DEPTH
                                )
best_rf.fit(train_x, train_y)
preds = best_rf.predict(valid_x)

get_error(valid_y, preds)
RMSE: 348.59619850309633
MAE:  224.36252312081848

Congrats, you now know how to do "Hyperparameter Tuning"

For Random Forests, you can also tune:

  • bootstrap: Whether bootstrap samples are used when building trees.
  • max_leaf_nodes: Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.
  • min_impurity_decrease: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
  • min_samples_leaf: The minimum number of samples required to be at a leaf node.
  • min_samples_split: The minimum number of samples required to split an internal node.
  • min_weight_fraction_leaf: The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.

That seems like a lot of work, is there an easier way?

Yup! Introducing Random Search & Grid Search...

Using a search space that you provide, sklearn will randomly sample parameter sets and evaluate their performance.

In [34]:
# Create the random grid
random_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'max_depth': [None, 1, 3, 5, 10, 15, 20, 25, 50],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'bootstrap': [True],
}

# number of possible parameter combinations = product of the option counts
from math import prod
m = prod(len(v) for v in random_grid.values())
print(f'Number of Combinations: {m}')
Number of Combinations: 11880
In [ ]:
# use the random grid to search for best hyperparameters
rf_random = RandomizedSearchCV(
    estimator = RandomForestRegressor(), # the sklearn model to use
    param_distributions = random_grid,   # the search space (specified above)
    n_iter = 5000,                       # search across n combinations
    cv = 5,                              # k fold cross validation
    verbose = 2,                         # print out progress
    n_jobs = -1                          # how many CPU cores to use, if -1 use all available
)

# Fit the random search model
rf_random.fit(df[CONT_COLS + CAT_COLS].values, df[[TARGET]].values.ravel())
In [36]:
# run this to get best parameters in search
#rf_random_best_params = rf_random.best_params_
In [35]:
# best params from prev search
rf_random_best_params = {
    'n_estimators': 100,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 4,
    'max_depth': 20,
    'bootstrap': True,
    'n_jobs': -1,
}
In [36]:
# fit and predict
best_rf = RandomForestRegressor(**rf_random_best_params)
best_rf.fit(train_x, train_y)
preds = best_rf.predict(valid_x)

get_error(valid_y, preds)
RMSE: 348.98280492519245
MAE:  237.01062526939086

Optional Exercise: Can you find a better hyperparameter combination?

In [ ]:
 

Grid Search is similar to Random Search, only instead of randomly sampling, EVERY combination will be tested. This is useful as a final optimization once Random Search has narrowed the ranges.

Optional Exercise: Find the optimal parameters given your Random Search results

In [39]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'n_estimators': [],
    'min_samples_split': [],
    'min_samples_leaf': [],
    'max_features': [],
    'max_depth': [],
    'bootstrap': [],
}

# init new rf instance
# BUG FIX: this is a regression task and RandomForestClassifier was never
# imported (NameError) — use RandomForestRegressor
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf,
                           param_grid = param_grid,
                           cv = 5,
                           n_jobs = -1,
                           verbose = 2)
In [ ]:
grid_search.best_params_
In [ ]:
best_rf = RandomForestRegressor(**grid_search.best_params_)
best_rf.fit(train_x, train_y)
preds = best_rf.predict(valid_x)

get_error(valid_y, preds)

Now you really know how to do Hyperparameter Tuning

  • This is a very common workflow when tuning models
  • It takes some patience and some time to understand each parameter, but it's definitely worth it -- especially when you need to squeeze out as much accuracy as possible

Feature Importance

We have covered Feature Importance briefly already, but let's take a closer look at this super powerful technique.

In [37]:
def get_feature_importance(model, feature_names, categorical_cols):
    """Build a DataFrame of the model's impurity-based feature importances.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_`
    feature_names : names aligned with the model's input columns
    categorical_cols : feature names to tag as 'categorical'

    Returns a DataFrame with columns [feature, importance, feat_type],
    sorted by importance descending.
    """
    imp = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    })

    # tag each feature with its type for plotting / aggregation
    imp['feat_type'] = imp.feature.apply(lambda feat: 'categorical' if feat in categorical_cols else 'continuous')

    # sort once at the end (the original sorted twice, redundantly)
    return imp.sort_values('importance', ascending = False)
In [38]:
imp = get_feature_importance(best_rf, train_x.columns.tolist(), categorical_cols=CAT_COLS)
In [39]:
px.bar(imp, x='feature', y='importance', color='feat_type')
In [40]:
imp.groupby('feat_type').mean()
Out[40]:
importance
feat_type
categorical 0.022483
continuous 0.123586

And why is this important..?

  • Dimension Reduction
  • Feature Engineering
  • Interpretability
In [41]:
def kfold_train_eval(df, features, k, model_params, return_last = False, n_iter = 3,
                     random_state = None):
    """K-fold cross-validated fit/eval of a RandomForestRegressor.

    The frame is shuffled first, then each of the `k` folds is fit
    `n_iter` times to average out the model's randomness.

    Parameters
    ----------
    df : frame containing `features` and the notebook-global TARGET column
    features : columns used as model inputs
    k : number of folds
    model_params : kwargs forwarded to RandomForestRegressor
    return_last : if True, also return the model from the final fit
    n_iter : repeated fits per fold
    random_state : seed for the initial shuffle; None keeps the previous
        unseeded (non-reproducible) behavior — pass e.g. SEED to fix folds

    Returns {'rmse': ..., 'mae': ...} averaged over all fits, optionally
    with the last fitted model.
    """
    # shuffle so folds are not ordered by the original row order
    df = df.sample(len(df), random_state=random_state).reset_index(drop = True)
    x = df[features]
    y = df[[TARGET]]
    kf = KFold(n_splits = k)

    errors = {'rmse': [], 'mae': []}

    for train_idx, valid_idx in kf.split(x):

        # build train/valid split
        train_x, valid_x = x.loc[train_idx], x.loc[valid_idx]
        train_y, valid_y = y.loc[train_idx].values.ravel(), y.loc[valid_idx].values.ravel()

        for _ in range(n_iter):

            # fit & predict
            model = RandomForestRegressor(**model_params)
            model.fit(train_x, train_y)
            preds = model.predict(valid_x)

            # record error
            errors['rmse'].append(rmse(valid_y, preds))
            errors['mae'].append(mae(valid_y, preds))

    # average each metric across folds x repeated fits
    results = {metric: np.mean(score) for metric, score in errors.items()}

    if return_last: return results, model
    else:           return results
In [42]:
# kfold score before new features
kfold_train_eval(df,
                 features=CAT_COLS+CONT_COLS,
                 k = 10,
                 model_params = rf_random_best_params
                )
Out[42]:
{'rmse': 336.19424227523353, 'mae': 235.47176070660788}
In [43]:
# add additional features
df['space_per_room'] = df.space / df.rooms
df['years_since_renovation'] = df.renovated_date - df.year_construction
df['renovation_ratio'] = df.renovated_date / df.year_construction
df['energy_efficiency'] = df.energy_requirement / df.space

CONT_COLS += ['space_per_room', 'years_since_renovation', 'renovation_ratio', 'energy_efficiency']
In [44]:
# kfold eval with new features
results, model = kfold_train_eval(
    df,
    features=CAT_COLS + CONT_COLS,
    k=10,
    model_params=rf_random_best_params,
    return_last=True
)
results
Out[44]:
{'rmse': 335.516924255293, 'mae': 237.49906153036937}
In [45]:
imp = get_feature_importance(model, feature_names=CAT_COLS+CONT_COLS, categorical_cols=CAT_COLS)
In [46]:
px.bar(imp, x = 'feature', y = 'importance', color = 'feat_type')

Exercise: Let's do some feature engineering

  • Create a new feature
  • Refit the model
    • Does accuracy increase?
    • How "important" is that feature?
In [47]:
## Student TODO
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Anything we're missing?

  • If you recall from the beginning of the dataset, instead of treating categorical variables as their appropriate type or in a potentially more appropriate encoding, we encoded them numerically.
    • As previously mentioned, this can be a problem because the model is treating them as a continuous variable (which suggests an order to the data), when they are actually discrete.

So what can we do? We have a few options:

  • Drop the features all together
  • Use a model that allows the category data type (LightGBM and CatBoost are popular choices)
  • Use One Hot Encoding
  • Use Mean Target Encoding
  • ADVANCED: Create an embedding for each of the categories and use the embeddings as features

Categorical Target Encoding

In [48]:
def target_encode_categorical_features(df, feature, target, encode_type):
    """Add a `{feature}_{encode_type}` column holding the chosen target
    statistic (mean / median / std) of `target` within each category of
    `feature`.

    Returns a copy; the input frame is not modified.
    Raises ValueError for an unknown `encode_type`.
    """
    df = df.copy()

    # aggregate only the target column — aggregating the whole frame (as the
    # original `groupby(feature).mean()[target]` did) fails on non-numeric
    # columns in pandas >= 2.0
    grouped = df.groupby(feature)[target]

    if encode_type == 'mean':
        encoding_map = grouped.mean().to_dict()
    elif encode_type == 'median':
        encoding_map = grouped.median().to_dict()
    elif encode_type == 'std':
        encoding_map = grouped.std().to_dict()
    else:
        raise ValueError('encode_type must be one of the following: "mean", "median", or "std"')

    # .map is faster and clearer than .apply with a dict-lookup lambda
    df[f'{feature}_{encode_type}'] = df[feature].map(encoding_map)

    return df
In [49]:
for c in CAT_COLS:
    df = target_encode_categorical_features(df, feature=c, target=TARGET, encode_type='mean')
In [50]:
mean_encoded_cat_cols = [f'{c}_mean' for c in CAT_COLS]

# kfold eval with new features
results, model = kfold_train_eval(
    df,
    features=CONT_COLS+mean_encoded_cat_cols,
    k=10,
    model_params=rf_random_best_params,
    return_last=True
)
results
Out[50]:
{'rmse': 296.10808301238825, 'mae': 208.47085210682587}
In [51]:
imp = get_feature_importance(
    model=model,
    feature_names=CONT_COLS+mean_encoded_cat_cols,
    categorical_cols=mean_encoded_cat_cols
)
In [52]:
px.bar(imp, x = 'feature', y = 'importance', color = 'feat_type')

Exercise: What happens when we add Median and Standard Deviation Encoding?

In [53]:
# add median and std target encodings for every raw categorical column
for c in CAT_COLS:
    df = target_encode_categorical_features(df, feature=c, target=TARGET, encode_type='median')
    df = target_encode_categorical_features(df, feature=c, target=TARGET, encode_type='std')
    
# drop the raw (label-encoded) categorical columns; only encodings remain
df.drop(CAT_COLS, axis=1, inplace=True)

# rebind CAT_COLS to the encoded column names — the comprehension reads the
# OLD CAT_COLS because the assignment only happens after it finishes
CAT_COLS = [f'{c}_{encode_type}' for encode_type in ['mean', 'median', 'std'] for c in CAT_COLS]

# a category observed in a single row has an undefined std -> NaN; treat as 0
for c in CAT_COLS:
    if '_std' in c:
        df[c] = df[c].fillna(0.)
In [54]:
# kfold eval with new features
results, model = kfold_train_eval(
    df,
    features=CONT_COLS+CAT_COLS,
    k=10,
    model_params=rf_random_best_params,
    return_last=True
)
results
Out[54]:
{'rmse': 306.4737042125943, 'mae': 209.87944674574052}
In [55]:
imp = get_feature_importance(
    model=model,
    feature_names=CONT_COLS+CAT_COLS,
    categorical_cols=CAT_COLS
)
In [56]:
px.bar(imp, x = 'feature', y = 'importance', color = 'feat_type')

Are there other types of importance?

In [57]:
def get_best_fit(train_x, train_y, valid_x, valid_y, model_params, fit_iter, best_error=None):
    """Fit `fit_iter` RandomForestRegressors and return the one with the
    lowest validation RMSE.

    `best_error` optionally seeds the threshold a fit must beat.
    Raises ValueError if no fit improves on a supplied `best_error`
    (previously this surfaced as an UnboundLocalError on `best_model`).
    """
    best_model = None  # stays None only if no fit beats the seeded best_error

    for _ in tqdm(range(fit_iter)):
        model = RandomForestRegressor(**model_params)
        model.fit(train_x, train_y)
        preds = model.predict(valid_x)
        error = rmse(valid_y, preds)

        # `is None` (not truthiness) so a legitimate best_error of 0.0 works
        if best_error is None or error < best_error:
            best_error = error
            best_model = model  # keep a reference; `model` is rebound next loop

    print(f'Best error after {fit_iter} iterations: {best_error}')

    if best_model is None:
        raise ValueError('no fitted model improved on the supplied best_error')

    return best_model

def get_permutation_importance(model, valid_x, valid_y, n_repeats):
    """Run sklearn's permutation importance on the validation split and
    return it as a tidy [feature, importance] DataFrame, where importance
    is the mean score drop over `n_repeats` shuffles.
    """
    result = permutation_importance(model, valid_x, valid_y, n_repeats=n_repeats)

    imp_df = pd.DataFrame({
        'feature': valid_x.columns,
        'importance': result['importances_mean'],
    })
    return imp_df

def get_avg_feature_importance(df, features, categorical_cols,
                               model_params, n_iter, fit_iter, permutation_repeats):
    """Average RF impurity importance and permutation importance over
    `n_iter` random train/valid splits, then combine them into a single
    normalized `weighted_imp` score per feature.

    Relies on the notebook globals TARGET and VALID_SZ and on the helpers
    get_train_test_split / get_best_fit / get_feature_importance /
    get_permutation_importance defined above.

    Returns a DataFrame with columns
    [feature, feat_type, importance_rf, importance_perm, weighted_imp],
    sorted by weighted_imp descending.
    """
    feature_importance_results = []
    
    for _ in range(n_iter):
    
        # rebuild train/valid with new features
        train_x, valid_x, train_y, valid_y = get_train_test_split(
            df, x_cols=features, target=TARGET, test_size=VALID_SZ, random_state=None)

        # find the "best fit" of the model, to be used to find permutation importance (you usually shouldn't do
        # this as it is considered overfitting the validation set, but for feature selection it can be helpful)
        model = get_best_fit(train_x, train_y, valid_x, valid_y, model_params, fit_iter=fit_iter)

        # get regular random forest importance
        imp = get_feature_importance(model, feature_names=train_x.columns.tolist(), categorical_cols=categorical_cols)

        # get permutation importance using sklearn, this takes a long time but the more "n_repeats" the better the results!
        permutation_imp = get_permutation_importance(model, valid_x, valid_y, n_repeats=permutation_repeats)
        
        # combine both importance dfs
        imp = imp.merge(permutation_imp, on='feature', suffixes = ('_rf', '_perm'))
        
        # save this result
        feature_importance_results.append(deepcopy(imp))
    
    # return the average permutation importance for each feature
    imp = deepcopy(pd.concat(feature_importance_results).groupby('feature').agg({
        'feat_type': 'first',
        'importance_rf': 'mean',
        'importance_perm': 'mean',
    }).reset_index())
    
    # normalize permutation importance (think of it as the percentage of importance)
    imp['importance_perm'] = imp['importance_perm'] / imp['importance_perm'].sum()
    
    # create weighted importance (log1p damps the permutation term)
    imp['weighted_imp'] = imp.importance_rf * np.log1p(imp.importance_perm)
    imp['weighted_imp'] = imp['weighted_imp'] / imp['weighted_imp'].sum()
    
    # reorg columns and sort by weighted importance
    return imp[['feature', 'feat_type', 'importance_rf', 'importance_perm', 'weighted_imp']
              ].sort_values('weighted_imp', ascending=False).reset_index(drop=True).copy(deep=True)

Get Permutation Importance (~30 minutes)

In [87]:
# # get average permutation importance across {n_iter} folds
# imp = get_avg_feature_importance(
#     df=df,
#     features=CONT_COLS+CAT_COLS,
#     categorical_cols=CAT_COLS,
#     model_params=rf_random_best_params,
#     n_iter=10,
#     fit_iter=100,
#     permutation_repeats=100
# )

# imp.to_pickle('feature_importance_w_permutation.pkl')
In [62]:
imp = pd.read_pickle('feature_importance_w_permutation.pkl')
In [89]:
px.bar(imp, x = 'feature', y = 'weighted_imp', color = 'feat_type')

What happens when we drop them?

Feature Selection Using Importance

In [63]:
most_important_features = imp[imp.weighted_imp > 0.001].feature.tolist()
In [64]:
kfold_train_eval(df, features=most_important_features, k=10, model_params=rf_random_best_params)
Out[64]:
{'rmse': 298.38146591161126, 'mae': 206.97065445321635}
In [65]:
errors = []
ranked_feats = imp.feature.tolist()

for i in tqdm(range(len(ranked_feats), 1, -1)):
    
    param_max_feats = rf_random_best_params['max_features']
    
    results = kfold_train_eval(
        df=df,
        features=ranked_feats[:i],
        k=10,
        model_params={**rf_random_best_params, 'max_features': min(param_max_feats, i)}
    )
    
    errors.append({
        'top_n': i,
        'rmse': results['rmse'],
        'mae': results['mae'],
    })

errors = pd.DataFrame(errors)

In [66]:
px.line(errors, x = 'top_n', y = 'rmse')

Optimal Feature Set

In [80]:
# idxmin returns the index LABEL (what .loc expects); argmin returns a
# position, which only works while the index happens to be 0..n-1
OPTIMAL_N_FEATURES = errors.loc[errors['rmse'].idxmin(), 'top_n']
In [81]:
optimal_feature_set = ranked_feats[:OPTIMAL_N_FEATURES]
In [82]:
kfold_train_eval(df, features=optimal_feature_set, k=10, model_params=rf_random_best_params)
Out[82]:
{'rmse': 297.30807486551475, 'mae': 206.74365475685678}

Model Interpretation

In [70]:
train_x, valid_x, train_y, valid_y = get_train_test_split(df=df, x_cols=optimal_feature_set,
                                                          target=TARGET, test_size=VALID_SZ, random_state=None)
In [71]:
model = get_best_fit(train_x, train_y, valid_x, valid_y, rf_random_best_params, 500)
Best error after 500 iterations: 266.7035581135556
In [72]:
def plot_partial_dependence(feature):
    """Plot a partial dependence plot for `feature` using the notebook
    globals `model`, `valid_x`, and `optimal_feature_set`.

    NOTE(review): pdp.pdp_isolate / pdp.pdp_plot is the pre-0.3 pdpbox
    API; newer pdpbox releases renamed these — confirm the pinned version.
    """
    pdp_space = pdp.pdp_isolate(
        model=model,
        dataset=valid_x,
        model_features=optimal_feature_set,
        feature=feature
    )
    pdp.pdp_plot(pdp_space, feature.title());
In [73]:
plot_partial_dependence('space')
In [74]:
plot_partial_dependence('energy_requirement')
In [85]:
plot_partial_dependence('energy_efficiency')
In [75]:
plot_partial_dependence('rooms')
In [76]:
plot_partial_dependence('year_construction')
In [86]:
plot_partial_dependence('renovated_date')

Summary w/ SHAP Values

In [77]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(valid_x)
In [78]:
shap.summary_plot(shap_values, valid_x)
In [ ]:
 
In [ ]:
 
In [ ]: